# _Code snippets/fixest_snippets.R

#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Tue Sep 22 15:30:09 2020
# ~: Code snippets to share
#----------------------------------------------#


####
#### Sparse model.matrix ####
####

# Here's some quick and dirty (hence limited) code to create a sparse model.matrix from
# a fixest estimation

# What does it do? It returns a list of two elements:
# - mat_RHS: the *sparse* version of the RHS (excluding the FEs)
# - mat_FE: the sparse matrix of the fixed-effects
# NOTE: both are Matrix matrices from library(Matrix)

# Limitations:
# - does not automatically remove the reference levels from factors
# - does not handle estimations containing variables with varying slopes
#

# Benefits (yes there are some!):
# - avoids copies of the data in the process of constructing the matrix
# - handles interactions between any number of factors/numeric variables
# - the non-sparse model matrix is never created, hence it's very fast
#

# Be sure to run the *three* functions sparse_model_matrix, mult_wrap and mult_sparse
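
# Intended usage, as a sketch with placeholder names (the full runnable example is at
# the end of this section):
#   est  = feols(y ~ x1 + factor(x2) | fe1, data)
#   mats = sparse_model_matrix(est)   # list(mat_RHS = ..., mat_FE = ...)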


# x: fixest estimation
sparse_model_matrix = function(x){

    require(Matrix)

    # Linear formula
    fml_lin = formula(x, "lin")

    data = fixest:::fetch_data(x)
    data = as.data.frame(data)
    data = data[obs(x), ]

    #
    # Step 1: Linear matrix
    #

    vars = attr(terms(fml_lin), "term.labels")

    if(length(vars) == 0){
        # Case only FEs
        mat = NULL
    } else {

        # Since we don't want to evaluate the factors,
        # the code is a bit intricate because we have to catch them before
        # any interaction takes place
        #
        # that's why I wrap interactions in a function (mult_sparse())
        #

        # Below, we evaluate all the variables in a "sparse" way

        vars_calls = lapply(vars, mult_wrap)

        n = length(vars)
        variables_list = vector("list", n)
        for(i in 1:n){
            variables_list[[i]] = eval(vars_calls[[i]], data)
        }

        # To create the sparse matrix, we need the indexes

        total_cols = 0
        running_cols = c(0)
        for(i in 1:n){
            xi = variables_list[[i]]
            if(inherits(xi, "sparse_var")){
                total_cols = total_cols + xi$n_cols
            } else {
                total_cols = total_cols + NCOL(xi)
            }
            running_cols[i + 1] = total_cols
        }

        # We just create a sparse matrix and fill it

        # 1) creating the indexes + names

        # NOTA: I use lists to avoid creating copies
        rowid = 1:nrow(data)
        id_all = values_all = names_all = vector("list", n)
        for(i in 1:n){
            xi = variables_list[[i]]
            if(inherits(xi, "sparse_var")){
                id_all[[i]] = cbind(xi$rowid, running_cols[i] + xi$colid)
                values_all[[i]] = xi$values
                names_all[[i]] = paste0(vars[[i]], "::", xi$col_names)
            } else if(NCOL(xi) == 1){
                id_all[[i]] = cbind(rowid, running_cols[i] + 1)
                values_all[[i]] = xi
                names_all[[i]] = vars[[i]]
            } else {
                colid = rep(1:NCOL(xi), each = nrow(data))
                id_all[[i]] = cbind(rep(rowid, NCOL(xi)), running_cols[i] + colid)
                values_all[[i]] = as.vector(xi)
                if(!is.null(colnames(xi))){
                    names_all[[i]] = paste0(vars[[i]], colnames(xi))
                } else {
                    names_all[[i]] = paste0(vars[[i]], 1:NCOL(xi))
                }
            }
        }

        id_mat = do.call(rbind, id_all)
        values_vec = unlist(values_all)
        names_vec = unlist(names_all)

        # 2) filling the matrix: one shot, no copies

        mat = Matrix(0, nrow(data), total_cols, dimnames = list(NULL, names_vec))
        mat[id_mat] = values_vec
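
        # Side note (a sketch of an alternative, equivalent construction): the same
        # matrix could also be built in one call with Matrix::sparseMatrix(), e.g.
        #   sparseMatrix(i = id_mat[, 1], j = id_mat[, 2], x = values_vec,
        #                dims = c(nrow(data), total_cols),
        #                dimnames = list(NULL, names_vec))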
    }

    #
    # Step 2: the fixed-effects
    #

    if(length(x$fixef_id) == 0){
        mat_FE = NULL
    } else {
        # Same process, but easier

        rowid = 1:nrow(data)
        total_cols = sum(x$fixef_sizes)
        running_cols = c(0, cumsum(x$fixef_sizes))
        n_FE = length(x$fixef_sizes)
        id_all = names_all = vector("list", n_FE)
        for(i in 1:n_FE){
            xi = x$fixef_id[[i]]
            id_all[[i]] = cbind(rowid, running_cols[i] + xi)
            names_all[[i]] = paste0(names(x$fixef_id)[i], "::", attr(xi, "fixef_names"))
        }

        id_mat = do.call(rbind, id_all)
        names_vec = unlist(names_all)

        mat_FE = Matrix(0, nrow(data), total_cols, dimnames = list(NULL, names_vec))
        mat_FE[id_mat] = 1
    }

    res = list(mat_RHS = mat, mat_FE = mat_FE)

    res
}

# Internal: modifies the calls so that each variable/interaction is evaluated with mult_sparse
mult_wrap = function(x){
    # x: character string of a variable to be evaluated
    # ex: "x1" => mult_sparse(x1)
    #     "x1:factor(x2):x3" => mult_sparse(x3, factor(x2), x1)
    #
    # We also add the argument sparse to i()
    #     "x1:i(species, TRUE)" => mult_sparse(x1, i(species, TRUE, sparse = TRUE))

    x_call = str2lang(x)

    res = (~ mult_sparse())[[2]]

    if(length(x_call) == 1 || x_call[[1]] != ":"){
        res[[2]] = x_call

    } else {
        res[[2]] = x_call[[3]]
        tmp = x_call[[2]]

        while(length(tmp) == 3 && tmp[[1]] == ":"){
            res[[length(res) + 1]] = tmp[[3]]
            tmp = tmp[[2]]
        }

        res[[length(res) + 1]] = tmp
    }

    # We also add sparse to i() if found
    for(i in 2:length(res)){
        ri = res[[i]]
        if(length(ri) > 1 && ri[[1]] == "i"){
            ri[["sparse"]] = TRUE
            res[[i]] = ri
        }
    }

    if(length(res) > 2){
        # we restore the original order
        res[-1] = rev(res[-1])
    }

    return(res)
}
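
# Illustration of what mult_wrap() produces (a sketch, nothing is evaluated here):
#   mult_wrap("x1")                =>  mult_sparse(x1)
#   mult_wrap("x1:factor(x2):x3")  =>  mult_sparse(x1, factor(x2), x3)
#   mult_wrap("x1:i(species)")     =>  mult_sparse(x1, i(species, sparse = TRUE))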

# Internal function to evaluate the variables (and interactions) in a sparse way
mult_sparse = function(...){
    # Only sparsifies factor variables
    # Takes care of interactions

    dots = list(...)
    n = length(dots)

    num_var = NULL
    factor_list = list()
    info_i = NULL
    is_i = is_factor = FALSE
    # You can't have interactions between i() and factors: it's either one or the other

    for(i in 1:n){
        xi = dots[[i]]
        if(is.numeric(xi)){
            # We stack the product
            num_var = if(is.null(num_var)) xi else xi * num_var
        } else if(inherits(xi, "i_sparse")){
            is_i = TRUE
            info_i = xi
        } else {
            is_factor = TRUE
            factor_list[[length(factor_list) + 1]] = xi
        }
    }

    if(!is_i && !is_factor){
        return(num_var)
    }

    if(is_factor){
        factor_list$add_items = TRUE
        factor_list$items.list = TRUE

        fact_as_int = do.call(to_integer, factor_list)

        values = if(is.null(num_var)) rep(1, length(fact_as_int$x)) else num_var

        rowid = seq_along(values)
        res = list(rowid = rowid, colid = fact_as_int$x, values = values,
                   col_names = fact_as_int$items, n_cols = length(fact_as_int$items))
    } else {

        values = info_i$values
        if(!is.null(num_var)){
            num_var = num_var[info_i$rowid]
            values = values * num_var
        }

        res = list(rowid = info_i$rowid, colid = info_i$colid, values = values,
                   col_names = info_i$col_names, n_cols = length(info_i$col_names))
    }

    class(res) = "sparse_var"

    res
}

library(fixest)
est = feols(mpg ~ poly(hp, 2) + as.factor(gear)*as.factor(carb) + i(am) | cyl, mtcars)

sparse_model_matrix(est)
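
# A quick check one might run (a sketch; exact column counts depend on the estimation):
mats = sparse_model_matrix(est)
dim(mats$mat_RHS)   # nobs(est) rows, one column per expanded RHS term
dim(mats$mat_FE)    # nobs(est) rows, one column per FE level (here: 3 levels of cyl)
# if needed, both parts can be combined into a single sparse design matrix
full_mat = cbind(mats$mat_RHS, mats$mat_FE)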


####
#### Marginal effects ####
####

# Quick and Dirty implementation of marginal effects
# Very limited but does the job. Easy to expand.

require(dreamerr) ; require(fixest)
meffect = function(x, at_means = TRUE, vcov, se, cluster, ...){
    # x: fixest object

    check_arg(x, "class(fixest) mbt")
    check_arg(at_means, "logical scalar")
    if(isFALSE(at_means)) stop("Sorry, so far only marginal effects at means are available.")

    is_FE = FALSE
    if(!is.null(x$fixef_vars)){
        if(x$method_type != "feglm"){
            stop("Models with fixed-effects are not yet supported for method ", x$method, ", use factors instead (if possible).")
        }
        is_FE = TRUE
    }

    if(!isTRUE(x$summary) || !missing(vcov) || !missing(se) || !missing(cluster)){
        x = summary(x, vcov = vcov, se = se, cluster = cluster, ...)
    }

    coef = x$coefficients
    vars = names(coef)

    vcov = x$cov.scaled

    if(!is_FE){
        m = model.matrix(x)
        m_mean = colMeans(m[, vars, drop = FALSE])

    } else {
        # x => feglm
        if(x$family$family %in% c("poisson", "binomial")){
            m = x$scores / (x$working_residuals * x$irls_weights)
        } else {
            m = x$scores / (x$working_residuals * x$irls_weights / x$dispersion)
        }

        m_mean = colMeans(m)
    }

    mu_mean = mean(predict(x, type = "link"))

    # 1) the SE
    # 2) the ME with formatting

    # 1) The standard errors via the delta method (at means)
    if(x$method_type == "feols" || (x$method_type == "feNmlm" && x$family == "gaussian")){
        d_f = function(x) 1
        dd_f = function(x) 0
    } else if((x$method_type == "feNmlm" && x$family == "poisson") || (x$method_type == "feglm" && x$family$family == "poisson")){
        d_f = function(x) exp(x)
        dd_f = function(x) exp(x)
    } else if(x$method_type == "feglm" && x$family$family == "binomial" && x$family$link == "probit"){
        d_f = dnorm
        dd_f = function(x) -x * dnorm(x)
    } else {
        stop("Current family is not supported.")
    }
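
    # Sketch of the reasoning behind the jacobian below (delta method, at means):
    #   ME_j = coef_j * d_f(mu_mean)
    #   d ME_j / d coef_k = 1{j == k} * d_f(mu_mean) + coef_j * m_mean_k * dd_f(mu_mean)
    # i.e. jac = coef %*% t(m_mean) * dd_f(mu_mean) + d_f(mu_mean) * Id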

    # jacobian
    jac = tcrossprod(coef, m_mean) * dd_f(mu_mean)
    diag(jac) = diag(jac) + d_f(mu_mean)
    vcov_new = jac %*% vcov %*% t(jac)

    sd = sqrt(diag(vcov_new))
    value = coef * d_f(mu_mean)
    z = value/sd

    res = data.frame(var = vars, dydx = value, se = sd, z = z,
                     pvalue = 2 * pnorm(-abs(z)),
                     CI_95_low = value - 1.96 * sd, CI_95_high = value + 1.96 * sd,
                     row.names = seq_along(vars))

    res
}

#
# Example
#

# Note:
# - check with Stata's: margins, dydx(x1 x2) atmeans

base = iris
names(base) = c("y", "x1", "x2", "x3", "species")

# I set the default SE to "iid" for comparability with Stata
setFixest_vcov(all = "iid")

res = fepois(y ~ x1 + x2, base)
meffect(res)

res = feglm(1*(y > 5) ~ x1 + x2, base, family = binomial("probit"))
meffect(res)

#
# FE example

# With fixed-effects as factor
res_lin = fepois(y ~ x1 + x2 + species, base)
meffect(res_lin)

# With fixed-effects absorbed
res_fe = fepois(y ~ x1 + x2 | species, base)
meffect(res_fe)




####
#### Imai & Song ####
####

# Equivalence between the "unit" wfe estimation and the same estimation in fixest
# A) function to compute the unit weights
# B) comparison of the estimations
#
# other:
#  - AJPS 2019 paper: https://imai.fas.harvard.edu/research/files/FEmatch.pdf
#  - original code for unit weights (function GenWeightsUnit, line 1043): https://github.com/insongkim/wfe/blob/master/src/wfe.c

#
# A) weights
#

# Function to compute the weights (with full checks)

require(dreamerr) ; require(data.table)
IS_weights_unit = function(index, base){

    # ARGUMENT CHECK #

    check_arg(index, "mbt data.frame ncol(2) | os formula | character vector len(2)",
              .message = "The argument 'index' must contain the a) unit and b) the treatment variables. It can be either: i) a data.frame of two columns, ii) a one sided formula, or iii) a character vector of length 2.")

    if(!is.data.frame(index)){
        check_arg(base, "data.frame mbt", .message = "If the argument 'index' is not a data.frame, you must provide a data.frame in the argument 'base'.")

        if("formula" %in% class(index)){
            check_arg(index, "formula var(data)", .data = base)

            index = model.frame(index, base)

            check_arg(index, "data.frame ncol(2)", .message = "If a formula, then 'index' must lead to a data.frame with two variables.")

        } else {
            # flexible matching of names
            check_set_arg(index, "multi match", .choices = names(base), .message = "If the argument 'index' is a character vector, it must match (at least partially) the names of the data.frame in argument 'base'.")

            index = as.data.frame(base)[, index]
        }

    }

    # Here 'index' is a data.frame with two variables
    res = as.data.table(index)
    names(res) = c("unit", "treatment")

    res[, n_treated := sum(treatment), by = unit]
    res[, n_not_treated := sum(1 - treatment), by = unit]
    res[, m_size := treatment * n_not_treated + (1 - treatment) * n_treated]
    res[, w := 1 + treatment * (m_size / n_treated) + (1 - treatment) * (m_size / n_not_treated)]
    res[is.na(w), w := 0]

    res$w
}
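
# Toy illustration (hypothetical data, just to see the formula at work):
# - treated obs get     w = 1 + n_untreated / n_treated
# - untreated obs get   w = 1 + n_treated / n_untreated
# - units that never change treatment status get w = 0
toy = data.frame(unit  = c(1, 1, 1, 1, 2, 2),
                 treat = c(0, 0, 0, 1, 0, 0))
IS_weights_unit(toy)
#> [1] 1.333333 1.333333 1.333333 4.000000 0.000000 0.000000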


#
# B) Estimations
#

library(fixest) ; library(wfe)

# Data preparation

data(base_did)
set.seed(0)
base = data.table(base_did)
# we trim the first/last periods of some units to add variation in the weights (otherwise they are all equal to 2 or 0)
id_first = sample(108, 20) ; id_last = sample(108, 20)
base = base[!(id %in% id_first & period <= 3) & !(id %in% id_last & period >= 7)]
base[, treat_post := treat*post]

# Estimations

system.time(res_wfe <- wfe(y ~ treat_post + x1, data = base, treat = "treat_post", unit.index = "id", time.index = "period", method = "unit",
                           qoi = "ate", hetero.se=TRUE, auto.se=TRUE))


system.time(res_feols <- feols(y ~ treat_post + x1 | id, base, weights = w <- IS_weights_unit(~ id + treat_post, base)))

# Comparing the weights
all.equal(w, summary(res_wfe)$W$W.it)

# Timings on my laptop:
#    wfe: 320ms
# fixest:  10ms

# Coefficients and SEs:
rbind(fixest = coef(res_feols), wfe = coef(res_wfe))
rbind(fixest = se(res_feols), wfe = se(res_wfe))


####
#### F-test: 2-model comparison ####
####

f_test = function(m1, m2 = NULL) {
   # F-test between two models
   # If the second model is missing and the
   # first model has fixed-effects, then the test
   # compares it to the same model without FEs

   if(missing(m2) || is.null(m2)){
      if(!is.null(m1$fixef_id)){
         m2 = update(m1, . ~ . | 1)
      } else {
         stop("When 'm2' is missing, 'm1' must contain fixed-effects.")
      }
   }

   if(m1$ssr < m2$ssr){
      # we swap the two models so that m1 is the constrained one (higher SSR)
      tmp = m2
      m2 = m1
      m1 = tmp
   }

   ssr_c = m1$ssr
   ssr_u = m2$ssr

   dfr_c = degrees_freedom(m1, "resid", vcov = "iid")
   dfr_u = degrees_freedom(m2, "resid", vcov = "iid")
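   # F = [(SSR_c - SSR_u) / q] / [SSR_u / df_u], with q = df_c - df_u restrictions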

   n_restriction = dfr_c - dfr_u
   f_stat = (ssr_c - ssr_u) / ssr_u * dfr_u / n_restriction
   pval = 1 - pf(f_stat, n_restriction, dfr_u)

   data.frame(f_stat, pval)
}

m = feols(Petal.Length ~ Sepal.Length | Species, iris)
mm = feols(Petal.Length ~ Sepal.Length + Sepal.Width | Species, iris)

f_test(m, mm)
#>     f_stat      pval
#> 1 0.250248 0.6176589

m = lm(Petal.Length ~ Sepal.Length + Species, iris)
mm = lm(Petal.Length ~ Sepal.Length + Sepal.Width + Species, iris)

anova(m, mm)
#> Analysis of Variance Table
#> Model 1: Petal.Length ~ Sepal.Length + Species
#> Model 2: Petal.Length ~ Sepal.Length + Sepal.Width + Species
#>   Res.Df    RSS Df Sum of Sq      F Pr(>F)
#> 1    146 11.657
#> 2    145 11.637  1  0.020084 0.2502 0.6177

# comparison with no FE:
m = feols(Petal.Length ~ Sepal.Length | Species, iris)
f_test(m)
#>     f_stat pval
#> 1 624.9854    0